Preprocessing Dataset

Paths

Set the path of the dataframe file.

path_import = "data/original.csv"
path_export = "data/preprocessed.csv"

Libraries

Library for read dataset.

library(readr)

Library for data frames processing.

library(dplyr)
library(tidyr)

Library for R Markdown.

library(rmarkdown)
library(knitr)

Library for data presentation.

library(scales)

Library for manage strings.

library(stringr)

Load dataset

Load dataset.

df <- read_csv(path_import)

Statistics

Dimensions.

dim(df)
## [1] 183 447

Types

View witch types are in the dataset.

col_types_all <- 
  df %>% 
  sapply(typeof) %>% 
  unlist()

col_types_table <- 
  col_types_all %>% 
  table()

col_types <- 
  col_types_table %>% 
  as.vector()

names(col_types) <- names(col_types_table)
## character    double   logical 
##       158       204        85

As can be seen there are the three expected types: character, double and logical.

NA

Percentaje of NA values

Define function to see the amount of NA values in the dataframe.

percent_of_NA <- 
  function(df){
    num_of_NA <- 
      df %>% is.na() %>% sum()
    num_of_values <- 
      df %>% dim() %>% prod()
    percent_of_NA <- 
      (num_of_NA / num_of_values) %>% 
      percent()
    return(percent_of_NA)
  }
percent_of_NA(df)
## [1] "37%"

Columns with NA

Define functions to see the NA in columns.

num_of_NA_by_column <- 
  function(df){
    df %>% is.na() %>% colSums()
  }
remove_0 <- 
  function(x) x[x!=0]
names_of_colums_with_NA <- 
  function(df)
    df %>% 
      num_of_NA_by_column() %>% 
      remove_0 %>% 
      names()
percentaje_of_cols_with_NA <-
  function(df)
    (length(names_of_colums_with_NA(df)) / ncol(df)) %>% 
    percent()

Compute the percentaje of cols with NA.

percentaje_of_cols_with_NA(df)
## [1] "74%"

Inspect if there are columns full of NA.

is_full_of_NA <- function(col){
  num_of_NA <- 
    col %>% 
    is.na() %>% 
    sum()
  return(num_of_NA == length(col))
}
cols_full_of_NA <- 
  df %>% 
  select_if(is_full_of_NA) %>% 
  names()
##  [1] "authentihash"                  "scans.Bkav.result"            
##  [3] "scans.CMC.result"              "scans.ALYac.result"           
##  [5] "scans.Malwarebytes.result"     "scans.K7AntiVirus.result"     
##  [7] "scans.Baidu.result"            "scans.SUPERAntiSpyware.result"
##  [9] "scans.Gridinsoft.result"       "scans.ViRobot.result"         
## [11] "scans.BitDefenderTheta.result" "scans.TACHYON.result"         
## [13] "scans.VBA32.result"            "scans.Zoner.result"           
## [15] "scans.Panda.result"            "scans.Elastic.result"         
## [17] "scans.Cylance.result"          "scans.SentinelOne.result"

As can be seen there are many columns that are full of NA, so can be deleted.

df <- 
  select(df, -all_of(cols_full_of_NA))

Colums with the same value

Maybe there are columns that has the same value along all the vector, so are useless.

Define function to remove these columns.

different_values <- 
  function(x)
    x %>% na.omit() %>% unique() %>% length()
remove_columns_with_the_same_value <- 
  function(df)
      select_if(df, function(col) different_values(col) > 1)

Apply function.

num_of_cols_after_remove <- 
  df %>% 
  remove_columns_with_the_same_value() %>% 
  ncol()

Calculate the number of columns with same value.

ncol(df) - num_of_cols_after_remove
## [1] 147

Awesom! Many colums found. Let’s remove them.

df <- 
  remove_columns_with_the_same_value(df)

Inspecting dataframe

Now let’s deeply inspect into the dataframe.

View dataframe

View dataframe.

Renaming

The column “…1” is the row number, so “n” will be a better name. The “…JSON” it’s a bad name, just “json” is fine.

df <- 
  df %>% 
  rename(n = ...1, json=..JSON)

Removing cols

There are many duplicated cols, hashes & dates that can be removed, also many useless.

Dates

There are many dates in the dataset, that are not relevant for virus analysis. So let’s remove them.

Define a function for check if a col is of type Date.

not <- 
  function(x) !x

get_element <- 
  function(x, index) x[index]

is_date_col <- 
  function(col, pattern="^[:digit:]{4}[-:/][:digit:]{2}[-:/][:digit:]{2}")
    col %>% 
    as.character() %>% 
    na.omit() %>% 
    get_element(1) %>% 
    str_detect(pattern)

Columns detected.

df %>% 
  select_if(is_date_col) %>% 
  head() %>% 
  paged_table()

Define function for remove cols by a predicate.

remove_col_if <- 
  function(df, fun){
    cols_to_delete <- 
      df %>% 
      select_if(fun) %>% 
      colnames()
    df <- 
      df %>% 
      select(-cols_to_delete)
    return(df)
  }

Remove them.

df <- 
  remove_col_if(df, is_date_col)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cols_to_delete)` instead of `cols_to_delete` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

Hashes

There are many hashes cols that don’t really provide useful information. So remove them.

hashes <- 
  c("")

df <- 
  df %>% 
  select(
    -vhash,
    -sha256,
    -sha1,
    -scan_id,
    -ssdeep,
    -md5,
    -additional_info.androguard.certificate.serialnumber,
    -additional_info.androguard.certificate.thumbprint,
    -additional_info.exiftool.ZipCRC
  )

Scans

There are many scans of different antivirus, that has very similar information. Just keep the scan with less NA values.

Get the best col.

scans_col_names <- 
  df %>% 
  colnames() %>% 
  str_match_all("scans.[:alpha:]*.result") %>% 
  unlist()

scan_na_by_col <- 
  df %>% 
  select(all_of(scans_col_names)) %>%
  num_of_NA_by_column()

scan_col_witch_min_na <- 
  scan_na_by_col %>% 
  which.min()

best_scanner_colname <- 
  scan_na_by_col %>% 
  names() %>% 
  get_element(scan_col_witch_min_na)

best_scanner_col <- 
  df %>% 
  select(all_of(best_scanner_colname))

best_scanner_name <- 
  best_scanner_colname %>% 
  str_split("[.]") %>% 
  unlist() %>% 
  get_element(2)
## [1] "Best scan col: scans.Fortinet.result"
## [1] "Best scanner: Fortinet"

Drop all scans but the best.

col_index_scanners <- 
  df %>% 
  colnames() %>% 
  str_detect("scans") %>%
  unlist() %>% 
  which()

df <- 
  df %>% 
  select(-all_of(col_index_scanners)) %>% 
  cbind(best_scanner_col)

Individual columns

Reasons:

  • json column contains all the row as JSON.
  • permalink is the URL where Virus Total has the virus file.
  • Main.Activity & Package are strings with all different values.
  • FileTypeExtension, ZipFileName & MIMEType has same values as FileType.
  • ZipBitFlag doesn’t seems to matter.
  • additional_info.magic has the vesion of the ZIP file, that doesn’t seems to matter.
  • Subject.DN is the JSON fragment that has all the information about the subject, but these data are decomposed in the rest of Subject cols.
df <- 
  df %>% 
  select(
    -json,
    -permalink,
    -additional_info.androguard.AndroidApplicationInfo,
    -additional_info.androguard.Main.Activity,
    -additional_info.exiftool.MIMEType,
    -additional_info.exiftool.FileTypeExtension,
    -additional_info.exiftool.ZipFileName,
    -additional_info.magic,
    -additional_info.androguard.Package,
    -additional_info.androguard.certificate.Subject.DN,
    -additional_info.compressedview.uncompressed_size
  )

Groups of columns

Define a function to remove cols which name match a pattern.

remove_cols_which_name_match <- 
  function(df, pattern){
    cols_to_remove <- 
      df %>% 
      colnames() %>% 
      str_which(pattern)
    df_removed_cols <- 
      df %>% 
      select(-all_of(cols_to_remove))
    return(df_removed_cols)
  }

Remove groups.

Reasons:

  • Issuer group has the same information as Subject group.
  • CompressedView and RiskIndicator.APK groups have the same information as file_type group
df <- 
  df %>% 
  remove_cols_which_name_match("^additional_info.androguard.certificate.Issuer.[:alpha:]*$") %>% 
  remove_cols_which_name_match("^additional_info.compressedview.extensions.[:alpha:]*$") %>% 
  remove_cols_which_name_match("^additional_info.androguard.RiskIndicator.APK.[:alpha:]*$")

View results

Define a function for sort columns.

sort_cols <- 
  function(df){
    df <- 
      df %>% 
      select(order(colnames(df)))
    additionalInfo_cols_logical <- 
      df %>% 
      colnames() %>% 
      str_detect("additional_info")
    additionalInfo_cols <- 
      df %>% 
      select(which(additionalInfo_cols_logical))
    not_additionalInfo_cols <- 
      df %>% 
      select(which(!additionalInfo_cols_logical)) %>% 
      select(n, size, everything())
    return(cbind(not_additionalInfo_cols, additionalInfo_cols))
  }

Sort columns.

df <- 
  sort_cols(df)

View results.

Replacing values

Replace “Unknown” and “?” by NA

There are some columns that hast the value “Unknown” or “?” instead of NAs. So let’s replace them.

Define a function to replace values in cols that satisfy a predicate.

replace_when <- 
  function(df, fun, value, replacement){
    cols_to_replace <- 
      df %>% 
      select_if(fun) %>% 
      colnames()
    df_replaced_cols <- 
      df %>% 
      select(all_of(cols_to_replace)) %>% 
      sapply(function(col) replace(col, which(col==value), replacement))
    df_without_replaced_cols <- 
      df %>% 
      select(-all_of(cols_to_replace))
    return(cbind(df_without_replaced_cols, df_replaced_cols))
  }

Replace ? and Unknown for NAs.

df <- 
  df %>% 
  replace_when(function(col) any(str_detect(col, fixed("Unknown"))), "Unknown", NA) %>% 
  replace_when(function(col) any(str_detect(col, fixed("?"))), "?", NA)

Replace NA for 0

Define functions

Define functions for replace NAs.

replace_na_which_colname_match <- 
  function(df, pattern, replacement){
    cols_to_replace <- 
      df %>% 
      colnames() %>% 
      str_which(pattern)
    df_replaced_cols <- 
      df %>% 
      select(all_of(cols_to_replace)) %>% 
      sapply(function(col) replace_na(col, replacement))
    df_without_replaced_cols <- 
      df %>% 
      select(-all_of(cols_to_replace))
    return(cbind(df_without_replaced_cols, df_replaced_cols))
  }
replace_na_when <- 
  function(df, fun, replacement){
    cols_to_replace <- 
      df %>% 
      select_if(fun) %>% 
      colnames()
    df_replaced_cols <- 
      df %>% 
      select(all_of(cols_to_replace)) %>% 
      sapply(function(col) replace_na(col, replacement))
    df_without_replaced_cols <- 
      df %>% 
      select(-all_of(cols_to_replace))
    return(cbind(df_without_replaced_cols, df_replaced_cols))
  }

Indiviual columns

The columns AndroidApplication and APK.SHARED.LIBRARIES shoud have 0s insteas of NAs.

df <- 
  df %>% 
  replace_na_which_colname_match("AndroidApplication", 0) %>% 
  replace_na_which_colname_match("APK.SHARED.LIBRARIES", 0) 

Groups of colums

The permissions (PERM) and the file types (file_types) groups of columns, seems that there are NAs where there should be 0s. So it would be better to replace them.

df <- 
  df %>% 
  replace_na_which_colname_match("PERM", 0) %>% 
  replace_na_which_colname_match("file_types", 0)

View results

Sort columns.

df <- 
  sort_cols(df)

View results.

New & modifiead colums

There are columns that must be only one, others provide more information by operating two columns, or summarises information.

New total permissions column

Create a new column that sums all permissions of permissions columns.

pattern <- "additional_info.androguard.RiskIndicator.PERM"

df_without_permissions <- 
  df %>% 
  select(., -(str_which(colnames(.), pattern)))

df_permissions <- 
  df %>% 
  select(., str_which(colnames(.), pattern)) %>% 
  mutate(., total_PERMs = rowSums(.))

df <- cbind(df_without_permissions, df_permissions)

Merge both MP3 columns into one

There are two mp3 extension columns, one for .MP3 and the other for .mp3, both are mp3 files. The others columns has the name of the extension in upper case, so let’s sum both into the MP3 column.

mp3_cols_logical <- 
  df %>% 
  colnames() %>% 
  tolower() %>% 
  str_detect("mp3")
mp3_upper_colname <- 
  colnames(df)[which(mp3_cols_logical)] %>% 
  str_match("^.*MP3$") %>% 
  unlist() %>% 
  na.omit()
sprintf("MP3 colname: %s", mp3_upper_colname)
## [1] "MP3 colname: additional_info.compressedview.file_types.MP3"

Sum them into additional_info.compressedview.file_types.MP3

mp3_col <- 
  df %>% 
  select(which(mp3_cols_logical)) %>% 
  rowSums(na.rm = TRUE)

df <- 
  df %>% 
  select(-which(mp3_cols_logical)) %>% 
  mutate(additional_info.compressedview.file_types.MP3 = mp3_col)

Express increased uncompressed size as percentage

There are two columns that refers to ZIP size, one for the compressed size and the other for the uncompressed size. It would be easier to compare them with the percentage of size increased after decompressed it.

increased_size_after_unzip <- 
  df$additional_info.exiftool.ZipUncompressedSize / 
  df$additional_info.exiftool.ZipCompressedSize

df <- 
  df %>% 
  select(-additional_info.exiftool.ZipUncompressedSize,
         -additional_info.exiftool.ZipCompressedSize) %>% 
  mutate(additional_info.exiftool.ZipIncreasedUncompressedSize = increased_size_after_unzip)

Suspicious heuristic as logical

The column additional_info.trendmicro.housecall.heuristic has a suspicious flag for some rows, the others are NA.

  1. So will be better to make this column logical:
  • NA -> FALSE
  • Suspicious -> True
  1. Also rename it to make it more understandable:
  • additional_info.trendmicro.housecall.heuristic -> additional_info.suspicious
suspicious <- 
  df$additional_info.trendmicro.housecall.heuristic %>% 
  is.na() %>% 
  not()

df <- 
  df %>% 
  select(-additional_info.trendmicro.housecall.heuristic) %>% 
  mutate(additional_info.suspicious = suspicious)

Decompose unpacker

he column additional_info.f.prot.unpacker has two possible values: “appended” and “UTF-8”, and when both happen they are combined into “appended, UTF-8”. So will be better to split this column into one logical for each of them.

Compute columns.

unpacker_appended <- 
  df$additional_info.f.prot.unpacker %>% 
  sapply(function(str) str_detect(str, "appended")) %>% 
  replace_na(FALSE)

unpacker_utf8 <- 
  df$additional_info.f.prot.unpacker %>% 
  sapply(function(str) str_detect(str, "UTF-8")) %>% 
  replace_na(FALSE)

Add them to dataframe.

df <- 
  df %>% 
  select(-additional_info.f.prot.unpacker) %>%
  mutate(additional_info.f.prot.unpacker.appended = unpacker_appended,
         additional_info.f.prot.unpacker.UTF8 = unpacker_utf8)

Decompose trid

The column additional_info.trid has percentages of archives.

df$additional_info.trid[1]
## [1] "Android Package (57.0%)\r\nJava Archive (20.0%)\r\nSweet Home 3D design (generic) (15.5%)\r\nZIP compressed archive (5.9%)\r\nPrintFox/Pagefox bitmap (640x800) (1.4%)"

Will be better to decompose this column into a column with the percentage for each value.

Extract values and percentages

trid_split_lines <- 
  df$additional_info.trid %>% 
  sapply(function(str) str_split(str, "\n"))
trid_split_lines[[3]]
## [1] "Java Archive (72.9%)\r"                  
## [2] "ZIP compressed archive (21.6%)\r"        
## [3] "PrintFox/Pagefox bitmap (640x800) (5.4%)"
trid_split_value_percentage <- 
  trid_split_lines %>%
  sapply(function(row) 
    sapply(row, function(str) 
      str_split(str, " \\(|%\\)")))
trid_split_value_percentage[[1]][[3]]
## [1] "Sweet Home 3D design" "generic)"             "15.5"                
## [4] "\r"
trid_values <- 
  trid_split_value_percentage %>% 
  sapply(function(row) 
    sapply(row, function(value_percentage) 
      get_element(value_percentage, 1)))
trid_values[[1]]
##                Android Package (57.0%)\r 
##                        "Android Package" 
##                   Java Archive (20.0%)\r 
##                           "Java Archive" 
## Sweet Home 3D design (generic) (15.5%)\r 
##                   "Sweet Home 3D design" 
##          ZIP compressed archive (5.9%)\r 
##                 "ZIP compressed archive" 
## PrintFox/Pagefox bitmap (640x800) (1.4%) 
##                "PrintFox/Pagefox bitmap"
trid_percentage <- 
  trid_split_value_percentage %>% 
  sapply(function(row) 
    sapply(row, function(value_percentage) 
      as.double(get_element(value_percentage, 
        length(value_percentage) - 1))))
trid_percentage[[1]]
##                Android Package (57.0%)\r 
##                                     57.0 
##                   Java Archive (20.0%)\r 
##                                     20.0 
## Sweet Home 3D design (generic) (15.5%)\r 
##                                     15.5 
##          ZIP compressed archive (5.9%)\r 
##                                      5.9 
## PrintFox/Pagefox bitmap (640x800) (1.4%) 
##                                      1.4

Define a function to convert the name to a proper trid colname.

trid_to_colname <- 
  function(name)
    name %>% 
      str_to_title() %>% 
      str_match_all("[:alpha:]*") %>% 
      unlist() %>% 
      paste0(collapse = " ") %>%
      str_remove_all(" ") %>% 
      sapply(function(name) paste("additional_info.trid", name, sep = "."))

Define function for rename the percentages.

rename_as_value <- 
  function(v){
    values_names <- 
      names(v) %>% 
      sapply(function(name) str_split(name, " \\(")) %>% 
      sapply(function(name_split) get_element(name_split, 1)) %>% 
      sapply(trid_to_colname)
    names(v) <- values_names
    return(v)
  }
trid_percentage[[1]] %>% 
  rename_as_value()
##        additional_info.trid.AndroidPackage 
##                                       57.0 
##           additional_info.trid.JavaArchive 
##                                       20.0 
##      additional_info.trid.SweetHomedDesign 
##                                       15.5 
##  additional_info.trid.ZipCompressedArchive 
##                                        5.9 
## additional_info.trid.PrintfoxPagefoxBitmap 
##                                        1.4

Rename the percentages.

trid_percentage_names_as_value <- 
  trid_percentage %>% 
  sapply(rename_as_value)
trid_percentage_names_as_value[[1]]
##        additional_info.trid.AndroidPackage 
##                                       57.0 
##           additional_info.trid.JavaArchive 
##                                       20.0 
##      additional_info.trid.SweetHomedDesign 
##                                       15.5 
##  additional_info.trid.ZipCompressedArchive 
##                                        5.9 
## additional_info.trid.PrintfoxPagefoxBitmap 
##                                        1.4

Create columns

Get columns names.

trid_labels <- 
  trid_values %>% 
  unlist() %>% 
  unique()
trid_labels
##  [1] "Android Package"                   "Java Archive"                     
##  [3] "Sweet Home 3D design"              "ZIP compressed archive"           
##  [5] "PrintFox/Pagefox bitmap"           "Opera Widget"                     
##  [7] "Mozilla Archive Format"            "Dalvik Dex class"                 
##  [9] "OpenOffice Extension"              "VYM Mind Map"                     
## [11] "Mozilla Firefox browser extension" "Konfabulator widget"
trid_values_colnames <- 
  trid_labels %>% 
  sapply(trid_to_colname)
trid_values_colnames
##                                   Android Package.AndroidPackage 
##                            "additional_info.trid.AndroidPackage" 
##                                         Java Archive.JavaArchive 
##                               "additional_info.trid.JavaArchive" 
##                            Sweet Home 3D design.SweetHomedDesign 
##                          "additional_info.trid.SweetHomedDesign" 
##                      ZIP compressed archive.ZipCompressedArchive 
##                      "additional_info.trid.ZipCompressedArchive" 
##                    PrintFox/Pagefox bitmap.PrintfoxPagefoxBitmap 
##                     "additional_info.trid.PrintfoxPagefoxBitmap" 
##                                         Opera Widget.OperaWidget 
##                               "additional_info.trid.OperaWidget" 
##                      Mozilla Archive Format.MozillaArchiveFormat 
##                      "additional_info.trid.MozillaArchiveFormat" 
##                                  Dalvik Dex class.DalvikDexClass 
##                            "additional_info.trid.DalvikDexClass" 
##                         OpenOffice Extension.OpenofficeExtension 
##                       "additional_info.trid.OpenofficeExtension" 
##                                          VYM Mind Map.VymMindMap 
##                                "additional_info.trid.VymMindMap" 
## Mozilla Firefox browser extension.MozillaFirefoxBrowserExtension 
##            "additional_info.trid.MozillaFirefoxBrowserExtension" 
##                           Konfabulator widget.KonfabulatorWidget 
##                        "additional_info.trid.KonfabulatorWidget"

Create an empty tibble with colnames set.

df_trid <- 
  matrix(nrow = nrow(df), 
         ncol = length(trid_values_colnames)) %>% 
  as_tibble(.name_repair = ~ trid_values_colnames) %>% 
  mutate_each(as.double)

Colnames.

df_trid %>% 
  colnames()
##  [1] "additional_info.trid.AndroidPackage"                
##  [2] "additional_info.trid.JavaArchive"                   
##  [3] "additional_info.trid.SweetHomedDesign"              
##  [4] "additional_info.trid.ZipCompressedArchive"          
##  [5] "additional_info.trid.PrintfoxPagefoxBitmap"         
##  [6] "additional_info.trid.OperaWidget"                   
##  [7] "additional_info.trid.MozillaArchiveFormat"          
##  [8] "additional_info.trid.DalvikDexClass"                
##  [9] "additional_info.trid.OpenofficeExtension"           
## [10] "additional_info.trid.VymMindMap"                    
## [11] "additional_info.trid.MozillaFirefoxBrowserExtension"
## [12] "additional_info.trid.KonfabulatorWidget"

Types.

df_trid %>% 
  sapply(is.double) %>% 
  sum() == ncol(df_trid)
## [1] TRUE

Insert values.

for(row_index in 1:nrow(df)){
  row <- trid_percentage_names_as_value[[row_index]]
  for(percentage_index in 1:length(row)){
    percentage <- row[percentage_index]
    colname <- names(percentage)
    df_trid[row_index, colname] <- percentage
  }
}

Replace NA for 0.

df_trid <- 
  df_trid %>% 
  replace(is.na(.), 0)

Merge dataframes

Finally merge df and df_trid into one.

df <- 
  df %>% 
  select(-additional_info.trid) %>% 
  cbind(df_trid)

View results

Sort columns.

df <- 
  sort_cols(df)

View results.

Save dataframe

After all preprocessing let’s save it into CSV.

write.csv(df, path_export)

Functions for preprocessing

As factor

labels <- 
  function(n){
    if(n == 5){
      return(c("very low", "low", "medium", "high", "very high"))
    }else if(n == 4){
      return(c("very low", "low", "high", "very high"))
    }else if(n == 3){
      return(c("low", "medium", "high"))
    }else if(n == 2){
      return(c("low", "high"))
    }else{
      stop("Not avalible")
    }
  }

cut_by_quantiles <- 
  function(col){
    quantiles <- 
      col %>% 
      quantile(na.rm = TRUE) %>% 
      unique()
    if(length(quantiles) > 2){
      col <- 
        col %>% 
        cut(breaks = quantiles, 
            labels = labels(length(quantiles)-1),
            include.lowest = TRUE)
    }
    return(col)
  }

df_cut_by_quantiles <- 
  function(df){
    df_without_numeric <- 
      df[sapply(df, function(col) !is.numeric(col))]
    df_numeric <- 
      df %>% 
      select_if(is.numeric)
    df_numeric <- 
      df_numeric %>% 
      lapply(cut_by_quantiles)
    return(cbind(df_without_numeric, df_numeric))
  }